1 Synapse data

We start with observations of 793 synapses with 12 measurments formatted as a 793x12 matrix. The matrix is Z-scored column-wise and plotted in a pairs plot colored according to the labels given by the field expert.

dat <- read.csv("rorb_gaussianAvg_at.csv")
loc <- read.csv("rorb_gaussianAvg_at_orderLocations.csv")
gabaID <- read.csv("rorb_gaba.csv")
truth <- gaba <- gabaID$gaba

ccol <- c('blue', 'blue', 'blue', 'red', 'red', 'red', 'green', 'black', 'green', 'green', 'black', 'green')
ind <- order(ccol)
ccol <- sort(ccol)

dat <- dat[,ind]
sdat <- as.data.frame(scale(dat, center = TRUE, scale = TRUE))
pairs(sdat, col = gaba + 1, pch = 20, cex = 0.2,
main = "Pairs plot colored by true class")

repHeat(sdat); title("Representative heatmap")

1.1 ZG: Get Elbows

We run the Z-scored data through PCA and plot the scree-plot with the elbows as given by Zhu and Ghodsi. A pairs plot is shown on the PCA data again colored by class labels.

pc1 <- princomp(sdat)
getElbows(pc1$sdev)

pairs(pc1$scores, col = gaba +1, pch = 20, cex = 0.5)

## [1]  5  7 10

1.2 LDA and QDA

Next, applying LDA and QDA iterating on the number of PCA dimension included.

ldaL <- 
  lapply(1:12, 
         function(x){
           ldatmp <- lda(gaba ~ pc1$scores[, 1:x], CV = FALSE)
           predict(ldatmp)
         })

qdaL <- 
  lapply(1:12, 
         function(x){
           qdatmp <- qda(gaba ~ pc1$scores[, 1:x], CV = FALSE)
           predict(qdatmp)
         })

ll <- sapply(lapply(ldaL, '[[', 1), function(x) sum(x != gaba)/length(gaba))
ql <- sapply(lapply(qdaL, '[[', 1), function(x) sum(x != gaba)/length(gaba))

#png(file = "~/Desktop/supRes.png", 480,480)
plot(x = 1:12, y = seq(0,0.13,length = 12), type = 'n', xlab = expression(hat(d)), ylab = "L")
points(ll, type = 'b', col = 'blue')
points(ql, type = 'b', col = 'darkred')
text(10,max(ql), label = "qda", col = 'darkred')
text(10, min(ll), label = "lda", col = 'blue', pos = 1)

#dev.off()

qmm <- sapply(1:12, function(x) sum((gaba - qdaL[[x]]$post[, 1])^2))
lmm <- sapply(1:12, function(x) sum((gaba - ldaL[[x]]$post[, 1])^2))

1.3 Office Chat CEP 20180228

Data in \(M_{793 \times 12}\) is transformed using PCA with center=TRUE and scale=TRUE to \(D_{793 \times 12}\). Class labels, \(Y_{i\in [793]} \in {0,1}\) have been given by the field experts (quite possibly errorful) with proportions 708 and 85 out of 793 of non-gaba and gaba, respectively.

Let \(n_0, n_1\) be the numbers of points in class 0 and 1, respectively.

n0 <- 708
n1 <- 85

params <- list()
for(i in 1:12){
  params[[i]] <- list()
  for(j in unique(gaba)){
    params[[i]][[as.character(j)]] <- 
      list( 
           means = colMeans(as.matrix(pc1$scores[gaba == j, 1:i])), 
           sigma   = if(i == 1){
             sd(pc1$scores[gaba == j, 1:i])
           } else {
                  cov(pc1$scores[gaba == j, 1:i])
                  }
           )
  }}

1.4 Sampling

mont <- 500
truth  <- c(rep(0, n0), rep(1,n1))

s <- list()
for(mi in 1:mont){
set.seed(mi)
    s[[mi]] <- rbind( 
               rmvnorm(n0, mean = params[[12]]$'0'$means, sigma = params[[12]]$'0'$sigma),
               rmvnorm(n1, mean = params[[12]]$'1'$means, sigma = params[[12]]$'1'$sigma)
              )
}
        
d1 <- foreach(j = 1:12) %:% 
      foreach(x = 1:mont, .combine = 'rbind') %do% {
         
        qdaR <- qda(truth ~ s[[x]][, 1:j], CV = FALSE)
        rcl <- as.numeric(predict(qdaR)$class) - 1
        
        qdaD <- qda(truth ~ s[[x]][, 1:j], CV = TRUE)
        
        (Lr <- sum(rcl != truth)/length(truth))
        (Ld <- sum((as.numeric(qdaD$class) - 1) != truth)/length(truth))
        #assign(paste0("Lr", j), Lr)
        #assign(paste0("Ld", j), Ld)

        #cbind(
        #      eval(as.symbol(paste0("Lr", j))), 
        #      eval(as.symbol(paste0("Ld", j)))
        #      )
        cbind(Lr, Ld)
}

tmp <- cbind(Reduce(rbind, d1))
tmp1 <- as.data.frame(cbind(data.table::melt(tmp)))

tmp1$Var1 <- as.factor(rep(1:12, each = mont))
tmp1$Var2 <- as.factor(tmp1$Var2)
names(tmp1) <- c("dim", "L", "value")
p <- ggplot(data = tmp1, aes(x = L, y = value, fill = dim)) + 
     geom_boxplot(notch = TRUE) + facet_grid(. ~ dim)
plot(p)

dLines <- 
foreach(mi = 1:mont) %:%
foreach(x = 1:12, .combine = 'rbind') %dopar% {
  qdaR <- qda(truth ~ s[[mi]][, 1:x], CV = FALSE)
  rcl <- as.numeric(predict(qdaR)$class) - 1
        
  qdaD <- qda(truth ~ as.matrix(s[[x]][, 1:j]), CV = TRUE)
        
  (Lr <- sum(rcl != truth)/length(truth))
  (Ld <- sum((as.numeric(qdaD$class) - 1) != truth)/length(truth))
  cbind(Lr, Ld, mi, x)
}

lineDat <- Reduce('rbind', dLines)
pLines <- ggplot(as.data.frame(lineDat), aes(x = x, group = mi)) + geom_line(aes(y = Lr, color = mi)) 
show(pLines)

show(p1sig)
## notch went outside hinges. Try setting notch=FALSE.